"""
/*
 * Copyright 2011 OpenWAF.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
 """
import sys
from HTMLParser import HTMLParser


class HTMLView:
    """Result container for a parsed HTML document.

    Attributes:
        rootNode: root node of the tree; None until one is assigned.
        childNodes: list of top-level nodes; starts out empty.
    """

    def __init__(self):
        # Both fields start empty and are assigned after construction.
        self.childNodes = []
        self.rootNode = None
class HTMLViewNode:
    """A single node of the parsed HTML tree.

    Attributes:
        id: initialised to None.
        name: node name; the empty string by default.
        attributes: dictionary of attributes; empty by default.
        innerHTML: text carried by the node, or None when it has none.
        childNodes: list of child nodes, in the order they were added.
        lineno, offset: source position; -1 until a position is recorded.
        nodeType: one of the NODE_* constants; NODE_ELEMENT by default.
    """

    # Values stored in ``nodeType``.
    NODE_ELEMENT = 1
    NODE_DATA = 2
    NODE_COMMENT = 3
    NODE_DECL = 4

    def __init__(self):
        # A fresh node is an unnamed element with no known position.
        self.nodeType = HTMLViewNode.NODE_ELEMENT
        self.lineno = -1
        self.offset = -1
        self.id = None
        self.name = ""
        self.innerHTML = None
        self.attributes = {}
        self.childNodes = []

    def __str__(self):
        """Return this node's name and, depth-first, the names of all its
        descendants, each followed by a newline."""
        pieces = [str(self.name), "\n"]
        pieces.extend(str(child) for child in self.childNodes)
        return "".join(pieces)

def readAttributes(attrs):
    """Convert a sequence of attribute tuples into a dictionary.

    Each entry's first item becomes the key.  A two-item entry supplies its
    second item as the value; an entry of any other length maps to None.
    When a key occurs more than once the last occurrence wins.
    """
    return dict(
        (entry[0], entry[1] if len(entry) == 2 else None)
        for entry in attrs
    )

class WAFHTMLParser(HTMLParser):
    """Build a tree of ``HTMLViewNode`` objects from an HTML file.

    The file is read and fed to the parser inside the constructor, so the
    tree is reachable through ``rootNode`` as soon as the instance exists.
    An end tag whose name does not match the element currently open makes
    ``handle_endtag`` raise ``Exception``.

    Attributes:
        filepath: path given to the constructor (used in error messages).
        rootNode: unnamed node that receives the top-level nodes.
        curNode: node that currently receives new children.
        nodeStack: nodes to return to as open elements get closed.
        skip_whitespace: when true, whitespace-only text is not stored.
        rawbytes: the complete text of the file.
        rawlines: ``rawbytes`` split on newlines, indexed by ``lineno - 1``.
    """

    def __init__(self, filepath, skip_whitespace):
        """Read ``filepath`` and parse it immediately.

        Args:
            filepath: path of the HTML file to read.
            skip_whitespace: true to drop text that is only whitespace.
        """
        HTMLParser.__init__(self)
        self.filepath = filepath
        self.curNode = HTMLViewNode()
        self.rootNode = self.curNode
        self.nodeStack = [self.curNode]
        self.skip_whitespace = skip_whitespace
        # Close the file deterministically instead of leaking the handle.
        with open(filepath) as source:
            self.rawbytes = source.read()
        self.rawlines = self.rawbytes.split("\n")
        self.feed(self.rawbytes)

    def _rawTagName(self, tag, skip):
        """Return the tag name as spelled in the source file.

        HTMLParser hands tag names over lower-cased, so the name is re-read
        from the raw line at the parser's current position.  ``skip`` is the
        number of characters in front of the name (1 for "<", 2 for "</").
        """
        start = self.offset + skip
        return self.rawlines[self.lineno - 1][start:start + len(tag)].strip()

    def _newElement(self, tag, attrs):
        """Create an element node for the tag at the current position and
        attach it to the current node."""
        node = HTMLViewNode()
        node.lineno = self.lineno
        node.offset = self.offset
        node.name = self._rawTagName(tag, 1)
        node.attributes = readAttributes(attrs)
        self.curNode.childNodes.append(node)
        return node

    def _appendLeaf(self, innerHTML, nodeType=None):
        """Attach an unnamed node carrying ``innerHTML`` to the current node.

        ``nodeType`` is only overwritten when one is given; otherwise the
        node keeps the ``HTMLViewNode`` default.
        """
        node = HTMLViewNode()
        if nodeType is not None:
            node.nodeType = nodeType
        node.name = ""
        node.innerHTML = innerHTML
        self.curNode.childNodes.append(node)

    def handle_starttag(self, tag, attrs):
        """Open a new element and make it the current node."""
        node = self._newElement(tag, attrs)
        self.nodeStack.append(self.curNode)
        self.curNode = node

    def handle_startendtag(self, tag, attrs):
        """Add a self-closing element; the current node does not change.

        The source position is recorded here as well, consistent with
        ``handle_starttag``.
        """
        self._newElement(tag, attrs)

    def handle_endtag(self, tag):
        """Close the current element after checking the end tag matches it.

        Raises:
            Exception: if the end tag's name differs (ignoring case) from
                the name of the element currently open.
        """
        tag = self._rawTagName(tag, 2)
        if tag.lower() != self.curNode.name.lower():
            # Echo the mismatching pair on stdout, as the original code did.
            print("%s %s" % (tag, self.curNode.name))
            rule = "\n===================================="
            message = "".join([
                rule,
                rule,
                "\nMatching tag not found for tag " + self.curNode.name
                + "(" + str(self.curNode.lineno) + ","
                + str(self.curNode.offset) + ")\n",
                rule,
                "\nError Details ",
                rule,
                "\nFilepath:" + self.filepath,
                "\nLine no :" + str(self.lineno),
                "\nOffset  :" + str(self.offset),
                rule,
            ])
            raise Exception(message)

        self.curNode = self.nodeStack.pop()

    def handle_decl(self, data):
        """Store a declaration, re-wrapped in "<!" and ">"."""
        self._appendLeaf("<!" + data + ">", HTMLViewNode.NODE_DECL)

    def handle_comment(self, data):
        """Store a comment, re-wrapped in its "<!--" and "-->" delimiters."""
        self._appendLeaf("<!--" + data + "-->", HTMLViewNode.NODE_COMMENT)

    def handle_data(self, data):
        """Store a run of text, unless it is blank and blanks are skipped."""
        if self.skip_whitespace and not data.strip():
            return
        self._appendLeaf(data, HTMLViewNode.NODE_DATA)

    def handle_entityref(self, name):
        """Store a named entity reference in its "&name;" form."""
        # NOTE(review): the node keeps the default nodeType (NODE_ELEMENT),
        # exactly as before -- confirm whether consumers expect NODE_DATA.
        self._appendLeaf("&" + name + ";")

    def handle_charref(self, name):
        """Store a numeric character reference in its "&#name;" form.

        The base class ignores these by default, so without this override a
        reference such as "&#169;" would be missing from the tree.  The node
        is built the same way as in ``handle_entityref``.
        """
        self._appendLeaf("&#" + name + ";")

def countHTMLNodes(nodes):
    """Return how many of ``nodes`` have no ``innerHTML``.

    Args:
        nodes: iterable of objects exposing an ``innerHTML`` attribute.

    Returns:
        The number of nodes whose ``innerHTML`` is None.  An empty string
        counts as content, so such a node is not included.
    """
    # Identity test: only a genuine None means "no innerHTML".
    return sum(1 for node in nodes if node.innerHTML is None)
            


def parseHTML(filepath, skip_whitespace=False):
    """Parse the HTML file at ``filepath`` and return it as an ``HTMLView``.

    Args:
        filepath: path handed to ``WAFHTMLParser``.
        skip_whitespace: forwarded to ``WAFHTMLParser``; False by default.

    Returns:
        An ``HTMLView`` whose ``rootNode`` is the parser's root node and
        whose ``childNodes`` is that root node's list of children.
    """
    parser = WAFHTMLParser(filepath, skip_whitespace)
    view = HTMLView()
    root = parser.rootNode
    view.rootNode = root
    # Same list object as the root's, not a copy.
    view.childNodes = root.childNodes
    return view

        
        
if __name__ == "__main__":
    # Command-line use: parse the file named by the first argument, skipping
    # whitespace-only text, and print the names of the nodes found.
    if len(sys.argv) < 2:
        # Give a usage hint instead of failing with a bare IndexError.
        sys.stderr.write("usage: %s <html-file>\n" % sys.argv[0])
        sys.exit(2)
    w = WAFHTMLParser(sys.argv[1], True)
    print(w.curNode)
